import scrapy
import re
from newstju.items import NewsItem

class NewsscrapySpider(scrapy.Spider):
    """Spider for news.tju.edu.cn.

    Crawls the news-list pages, follows each article link, and yields one
    populated ``NewsItem`` per article (url/img/time from the list page;
    title/abstract/views/editor from the article page).
    """

    name = "newsScrapy"
    allowed_domains = ["news.tju.edu.cn"]
    # TODO url
    start_urls = []
    # start_urls = ["https://news.tju.edu.cn/xnxw1/qb.htm"]

    def parse(self, response):
        """Parse one news-list page.

        Yields a Request per article (handled by ``parse_article``) and, if
        there is a further list page, a Request back into ``parse``.
        """
        articles = response.xpath("//*[@id='pic']/li")
        for idx, article in enumerate(articles):
            # The first <li> is not an article entry; skip it.
            if idx == 0:
                continue
            item = NewsItem()
            # Article link (site-relative), thumbnail path, and publish time.
            article_url = article.xpath('hgroup/h4/a/@href').extract_first()
            article_img = article.xpath('a/img/@src').extract_first()
            article_time = article.xpath('hgroup/h5/text()').extract_first()

            # Bug fix: extract_first() returns None for entries without an
            # <a href>; re.search(None) would raise TypeError. Skip them.
            if article_url is None:
                continue
            match = re.search(r'(\d+/.*)', article_url)
            if not match:
                continue

            # Rebuild an absolute article URL from the numeric id/path tail.
            article_url = 'https://news.tju.edu.cn/info/' + match.group(1)
            item['img_path'] = article_img
            item['url'] = article_url
            item['time'] = article_time

            # Fetch the article page to fill in the remaining fields.
            yield scrapy.Request(article_url, meta={'item': item}, callback=self.parse_article)

        # Link to the next list page ("下页" = "next page").
        nextUrl = response.xpath('//a[contains(text(), "下页")]/@href').extract_first()
        self.logger.debug("nextUrl: %s", nextUrl)
        # Bug fix: on the last page (or if the layout changes) there is no
        # "下页" anchor and nextUrl is None — calling .find() on it raised
        # AttributeError. '1020.htm' marks the known final page; stop there.
        if nextUrl is None or '1020.htm' in nextUrl:
            return
        # Links from sub-pages are relative to .../xnxw1/qb/, links from the
        # first page are relative to .../xnxw1/ — pick the right prefix.
        if 'qb' not in nextUrl:
            nextUrl = 'https://news.tju.edu.cn/xnxw1/qb/' + nextUrl
        else:
            nextUrl = 'https://news.tju.edu.cn/xnxw1/' + nextUrl
        yield scrapy.Request(nextUrl, callback=self.parse, dont_filter=True)

    def parse_article(self, response):
        """Parse an article detail page and yield the completed item."""
        item = response.meta['item']
        title = response.xpath('//*[@id="fx_article_title"]/text()').extract_first()
        abstract = response.xpath('//*[@id="vsb_content_500"]/div/p[1]/text()').extract_first()
        # Bug fix: the original predicate was @class=contentTime (unquoted),
        # which XPath treats as a comparison against a child *element* named
        # contentTime, so it never matched. Attribute values must be quoted.
        # NOTE(review): matching span text against "dynclicks" looks suspect —
        # dynclicks is typically a JS counter function name, not visible text;
        # verify against the live page markup.
        views = response.xpath('//p[@class="contentTime"]/span[contains(text(), "dynclicks")]/text()').extract_first()
        editor = response.xpath('//*[@id="vsb_content_500"]/div/p[11]/text()').extract_first()
        item['title'] = title
        item['abstract'] = abstract
        item['views'] = views
        item['editor'] = editor

        # Hand the filled item to the configured pipelines.
        yield item
